Python Libraries and Modules¶

In [1]:
import os # Import the 'os' module
import numpy as np # NumPy for numerical operations
import pandas as pd # Data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.express as px # Plotly Express for interactive visualizations
import matplotlib.pyplot as plt # Graphing
import seaborn as sns # Graphing
import plotly.graph_objects as go # Library used for modified boxplot
from empiricaldist import Pmf, Cdf # Import Pmf and Cdf classes from the 'empiricaldist' module
from datetime import timedelta # Import the 'timedelta' class from the 'datetime' module
from sklearn.model_selection import train_test_split # Import 'train_test_split' from 'sklearn.model_selection', which is used to split a dataset into training and testing subsets
from sklearn.linear_model import LinearRegression # Import the 'LinearRegression' class from 'sklearn.linear_model', which is used to create a linear regression model
from sklearn.tree import DecisionTreeRegressor # Import the DecisionTreeRegressor class from the scikit-learn library
from sklearn.ensemble import RandomForestRegressor # Import the RandomForestRegressor class from the scikit-learn library
from sklearn.ensemble import GradientBoostingRegressor  # Import the GradientBoostingRegressor class from the scikit-learn library
from sklearn.metrics import mean_squared_error, mean_absolute_error # Import the 'mean_squared_error' and 'mean_absolute_error' functions from 'sklearn.metrics', which are used for evaluating regression models
from matplotlib.patches import Patch # Import the Patch class from the matplotlib.patches module
from copy import copy # Import the copy function from the copy module

Reading and Loading Dataset¶

In [2]:
# Read the CSV file located at the specified path into the 'dataset' DataFrame using pandas (pd)
dataset = pd.read_csv('C:/Users/Mohammad Navid/Desktop/US_AQI.csv') 

Data Prepocessing¶

Filtering Dataset for the Time Period from May 2020 - May 2022¶

In [3]:
# Filter the Data from May 2020 to May 2021"
filtered_data_1 = dataset[(dataset['Date'] >= '2020-05-01') & (dataset['Date'] <= '2021-05-31')]
# Filter the Data from June 2021 to May 2022"
filtered_data_2 = dataset[(dataset['Date'] >= '2021-06-01') & (dataset['Date'] <= '2022-05-31')]
# Filter the Data from May 2020 to May 2022"
full_data = dataset[(dataset['Date'] >= '2020-05-01') & (dataset['Date'] <= '2022-05-31')]
In [4]:
# Show full data table
full_data
Out[4]:
Unnamed: 0 CBSA Code Date AQI Category Defining Parameter Number of Sites Reporting city_ascii state_id state_name lat lng population density timezone
0 0 10140 2022-01-01 21 Good PM2.5 2 Aberdeen WA Washington 46.9757 -123.8094 16571.0 588.0 America/Los_Angeles
1 1 10140 2022-01-02 12 Good PM2.5 2 Aberdeen WA Washington 46.9757 -123.8094 16571.0 588.0 America/Los_Angeles
2 2 10140 2022-01-03 18 Good PM2.5 2 Aberdeen WA Washington 46.9757 -123.8094 16571.0 588.0 America/Los_Angeles
3 3 10140 2022-01-04 19 Good PM2.5 2 Aberdeen WA Washington 46.9757 -123.8094 16571.0 588.0 America/Los_Angeles
4 4 10140 2022-01-05 17 Good PM2.5 2 Aberdeen WA Washington 46.9757 -123.8094 16571.0 588.0 America/Los_Angeles
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
359311 365463 49740 2020-12-27 32 Good Ozone 1 Yuma AZ Arizona 32.5995 -114.5491 137612.0 311.0 America/Phoenix
359312 365464 49740 2020-12-28 44 Good PM10 1 Yuma AZ Arizona 32.5995 -114.5491 137612.0 311.0 America/Phoenix
359313 365465 49740 2020-12-29 38 Good Ozone 1 Yuma AZ Arizona 32.5995 -114.5491 137612.0 311.0 America/Phoenix
359314 365466 49740 2020-12-30 36 Good Ozone 1 Yuma AZ Arizona 32.5995 -114.5491 137612.0 311.0 America/Phoenix
359315 365467 49740 2020-12-31 35 Good PM2.5 1 Yuma AZ Arizona 32.5995 -114.5491 137612.0 311.0 America/Phoenix

305077 rows × 15 columns

Creating Parameter Columns from AQI Values (filtered_data_1) and delete unnecessary columns¶

In [5]:
# Add the values from the \"AQI\" column to each parameter column
filtered_data_1['CO'] = np.where((filtered_data_1['Defining Parameter'] == 'CO'), filtered_data_1['AQI'], np.nan)
filtered_data_1['NO2'] = np.where((filtered_data_1['Defining Parameter'] == 'NO2'), filtered_data_1['AQI'], np.nan)
filtered_data_1['Ozone'] = np.where((filtered_data_1['Defining Parameter'] == 'Ozone'), filtered_data_1['AQI'], np.nan)
filtered_data_1['PM10'] = np.where((filtered_data_1['Defining Parameter'] == 'PM10'), filtered_data_1['AQI'], np.nan)
filtered_data_1['PM2.5'] = np.where((filtered_data_1['Defining Parameter'] == 'PM2.5'), filtered_data_1['AQI'], np.nan)

# Replace NaN to O from these columns
filtered_data_1['CO'] = filtered_data_1['CO'].fillna(0)
filtered_data_1['NO2'] = filtered_data_1['NO2'].fillna(0)
filtered_data_1['Ozone'] = filtered_data_1['Ozone'].fillna(0)
filtered_data_1['PM10'] = filtered_data_1['PM10'].fillna(0)
filtered_data_1['PM2.5'] = filtered_data_1['PM2.5'].fillna(0)

# Convert specific columns to string data type
filtered_data_1['CO'] = filtered_data_1['CO'].astype(str)  
filtered_data_1['NO2'] = filtered_data_1['NO2'].astype(str)  
filtered_data_1['Ozone'] = filtered_data_1['Ozone'].astype(str)   
filtered_data_1['PM10'] = filtered_data_1['PM10'].astype(str)  
filtered_data_1['PM2.5'] = filtered_data_1['PM2.5'].astype(str) 

# Drop unnecessary columns from filtered_data_1
filtered_data_1.drop(['Unnamed: 0'], axis=1, inplace=True)
filtered_data_1.drop(['Defining Parameter'], axis=1, inplace=True)
filtered_data_1.drop(['timezone'], axis=1, inplace=True)

# Removing 'District of Columbia' and 'Puerto Rico' from filtered_data_1
filtered_data_1 = filtered_data_1[~filtered_data_1['state_name'].isin(['District of Columbia', 'Puerto Rico'])]
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\4089107575.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data_1['CO'] = np.where((filtered_data_1['Defining Parameter'] == 'CO'), filtered_data_1['AQI'], np.nan)
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\4089107575.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data_1['NO2'] = np.where((filtered_data_1['Defining Parameter'] == 'NO2'), filtered_data_1['AQI'], np.nan)
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\4089107575.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data_1['Ozone'] = np.where((filtered_data_1['Defining Parameter'] == 'Ozone'), filtered_data_1['AQI'], np.nan)
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\4089107575.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data_1['PM10'] = np.where((filtered_data_1['Defining Parameter'] == 'PM10'), filtered_data_1['AQI'], np.nan)
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\4089107575.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data_1['PM2.5'] = np.where((filtered_data_1['Defining Parameter'] == 'PM2.5'), filtered_data_1['AQI'], np.nan)
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\4089107575.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data_1['CO'] = filtered_data_1['CO'].fillna(0)
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\4089107575.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data_1['NO2'] = filtered_data_1['NO2'].fillna(0)
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\4089107575.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data_1['Ozone'] = filtered_data_1['Ozone'].fillna(0)
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\4089107575.py:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data_1['PM10'] = filtered_data_1['PM10'].fillna(0)
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\4089107575.py:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data_1['PM2.5'] = filtered_data_1['PM2.5'].fillna(0)
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\4089107575.py:16: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data_1['CO'] = filtered_data_1['CO'].astype(str)
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\4089107575.py:17: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data_1['NO2'] = filtered_data_1['NO2'].astype(str)
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\4089107575.py:18: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data_1['Ozone'] = filtered_data_1['Ozone'].astype(str)
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\4089107575.py:19: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data_1['PM10'] = filtered_data_1['PM10'].astype(str)
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\4089107575.py:20: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data_1['PM2.5'] = filtered_data_1['PM2.5'].astype(str)
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\4089107575.py:23: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data_1.drop(['Unnamed: 0'], axis=1, inplace=True)
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\4089107575.py:24: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data_1.drop(['Defining Parameter'], axis=1, inplace=True)
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\4089107575.py:25: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data_1.drop(['timezone'], axis=1, inplace=True)

Displaying Data for the Time Period May 2020 to May 2021¶

In [6]:
# Show date from May 2020 to May 2021
filtered_data_1
Out[6]:
CBSA Code Date AQI Category Number of Sites Reporting city_ascii state_id state_name lat lng population density CO NO2 Ozone PM10 PM2.5
25831 10100 2021-01-01 43 Good 1 Aberdeen SD South Dakota 45.4649 -98.4686 28315.0 661.0 0.0 0.0 0.0 0.0 43.0
25832 10100 2021-01-02 35 Good 1 Aberdeen SD South Dakota 45.4649 -98.4686 28315.0 661.0 0.0 0.0 0.0 0.0 35.0
25833 10100 2021-01-03 55 Moderate 1 Aberdeen SD South Dakota 45.4649 -98.4686 28315.0 661.0 0.0 0.0 0.0 0.0 55.0
25834 10100 2021-01-04 7 Good 1 Aberdeen SD South Dakota 45.4649 -98.4686 28315.0 661.0 0.0 0.0 0.0 0.0 7.0
25835 10100 2021-01-05 12 Good 1 Aberdeen SD South Dakota 45.4649 -98.4686 28315.0 661.0 0.0 0.0 0.0 0.0 12.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
359311 49740 2020-12-27 32 Good 1 Yuma AZ Arizona 32.5995 -114.5491 137612.0 311.0 0.0 0.0 32.0 0.0 0.0
359312 49740 2020-12-28 44 Good 1 Yuma AZ Arizona 32.5995 -114.5491 137612.0 311.0 0.0 0.0 0.0 44.0 0.0
359313 49740 2020-12-29 38 Good 1 Yuma AZ Arizona 32.5995 -114.5491 137612.0 311.0 0.0 0.0 38.0 0.0 0.0
359314 49740 2020-12-30 36 Good 1 Yuma AZ Arizona 32.5995 -114.5491 137612.0 311.0 0.0 0.0 36.0 0.0 0.0
359315 49740 2020-12-31 35 Good 1 Yuma AZ Arizona 32.5995 -114.5491 137612.0 311.0 0.0 0.0 0.0 0.0 35.0

179923 rows × 17 columns

Summary Statistics for filtered_data_1¶

In [7]:
# Summary Statistics of filtered_data_1
filtered_data_1.describe().style.background_gradient(cmap = "plasma")
Out[7]:
  CBSA Code AQI Number of Sites Reporting lat lng population density
count 179923.000000 179923.000000 179923.000000 179923.000000 179923.000000 179923.000000 179923.000000
mean 29966.728267 41.264936 3.384398 38.688751 -95.333500 454618.016768 999.460708
std 11357.713469 24.531330 4.928993 5.638516 16.910286 1372391.444181 919.182145
min 10100.000000 0.000000 1.000000 19.688300 -159.352100 1903.000000 4.000000
25% 19980.000000 29.000000 1.000000 34.933300 -109.220900 24479.000000 509.000000
50% 29620.000000 38.000000 2.000000 39.465300 -90.241200 98775.000000 777.000000
75% 40140.000000 48.000000 3.000000 42.444200 -82.151100 291388.000000 1183.000000
max 49740.000000 1250.000000 45.000000 64.835300 -68.790600 18680025.000000 10768.000000

Information Overview for filtered_data_1¶

In [8]:
# Information Overview of filtered_data_1
filtered_data_1.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 179923 entries, 25831 to 359315
Data columns (total 17 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   CBSA Code                  179923 non-null  int64  
 1   Date                       179923 non-null  object 
 2   AQI                        179923 non-null  int64  
 3   Category                   179923 non-null  object 
 4   Number of Sites Reporting  179923 non-null  int64  
 5   city_ascii                 179923 non-null  object 
 6   state_id                   179923 non-null  object 
 7   state_name                 179923 non-null  object 
 8   lat                        179923 non-null  float64
 9   lng                        179923 non-null  float64
 10  population                 179923 non-null  float64
 11  density                    179923 non-null  float64
 12  CO                         179923 non-null  object 
 13  NO2                        179923 non-null  object 
 14  Ozone                      179923 non-null  object 
 15  PM10                       179923 non-null  object 
 16  PM2.5                      179923 non-null  object 
dtypes: float64(4), int64(3), object(10)
memory usage: 24.7+ MB

Shape of filtered_data_1¶

In [9]:
# Shape of filtered_data_1
filtered_data_1.shape
Out[9]:
(179923, 17)

Creating Parameter Columns from AQI Values (filtered_data_2) and delete unnecessary columns¶

In [10]:
# Add the values from the \"AQI\" column to each parameter column
filtered_data_2['CO'] = np.where((filtered_data_2['Defining Parameter'] == 'CO'), filtered_data_2['AQI'], np.nan)
filtered_data_2['NO2'] = np.where((filtered_data_2['Defining Parameter'] == 'NO2'), filtered_data_2['AQI'], np.nan)
filtered_data_2['Ozone'] = np.where((filtered_data_2['Defining Parameter'] == 'Ozone'), filtered_data_2['AQI'], np.nan)
filtered_data_2['PM10'] = np.where((filtered_data_2['Defining Parameter'] == 'PM10'), filtered_data_2['AQI'], np.nan)
filtered_data_2['PM2.5'] = np.where((filtered_data_2['Defining Parameter'] == 'PM2.5'), filtered_data_2['AQI'], np.nan)

# Replace NaN to O from these columns
filtered_data_2['CO'] = filtered_data_2['CO'].fillna(0)
filtered_data_2['NO2'] = filtered_data_2['NO2'].fillna(0)
filtered_data_2['Ozone'] = filtered_data_2['Ozone'].fillna(0)
filtered_data_2['PM10'] = filtered_data_2['PM10'].fillna(0)
filtered_data_2['PM2.5'] = filtered_data_2['PM2.5'].fillna(0)

# Convert specific columns to string data type
filtered_data_2['CO'] = filtered_data_2['CO'].astype(str)  
filtered_data_2['NO2'] = filtered_data_2['NO2'].astype(str)  
filtered_data_2['Ozone'] = filtered_data_2['Ozone'].astype(str)   
filtered_data_2['PM10'] = filtered_data_2['PM10'].astype(str)  
filtered_data_2['PM2.5'] = filtered_data_2['PM2.5'].astype(str)  

# Drop unnecessary columns from filtered_data_2
filtered_data_2.drop(['Unnamed: 0'], axis=1, inplace=True)
filtered_data_2.drop(['Defining Parameter'], axis=1, inplace=True)
filtered_data_2.drop(['timezone'], axis=1, inplace=True)

# Removing 'District of Columbia' and 'Puerto Rico' from filtered_data_2
filtered_data_2 = filtered_data_2[~filtered_data_2['state_name'].isin(['District of Columbia', 'Puerto Rico'])]
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2543847843.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data_2['CO'] = np.where((filtered_data_2['Defining Parameter'] == 'CO'), filtered_data_2['AQI'], np.nan)
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2543847843.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data_2['NO2'] = np.where((filtered_data_2['Defining Parameter'] == 'NO2'), filtered_data_2['AQI'], np.nan)
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2543847843.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data_2['Ozone'] = np.where((filtered_data_2['Defining Parameter'] == 'Ozone'), filtered_data_2['AQI'], np.nan)
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2543847843.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data_2['PM10'] = np.where((filtered_data_2['Defining Parameter'] == 'PM10'), filtered_data_2['AQI'], np.nan)
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2543847843.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data_2['PM2.5'] = np.where((filtered_data_2['Defining Parameter'] == 'PM2.5'), filtered_data_2['AQI'], np.nan)
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2543847843.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data_2['CO'] = filtered_data_2['CO'].fillna(0)
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2543847843.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data_2['NO2'] = filtered_data_2['NO2'].fillna(0)
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2543847843.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data_2['Ozone'] = filtered_data_2['Ozone'].fillna(0)
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2543847843.py:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data_2['PM10'] = filtered_data_2['PM10'].fillna(0)
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2543847843.py:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data_2['PM2.5'] = filtered_data_2['PM2.5'].fillna(0)
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2543847843.py:16: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data_2['CO'] = filtered_data_2['CO'].astype(str)
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2543847843.py:17: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data_2['NO2'] = filtered_data_2['NO2'].astype(str)
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2543847843.py:18: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data_2['Ozone'] = filtered_data_2['Ozone'].astype(str)
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2543847843.py:19: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data_2['PM10'] = filtered_data_2['PM10'].astype(str)
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2543847843.py:20: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data_2['PM2.5'] = filtered_data_2['PM2.5'].astype(str)
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2543847843.py:23: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data_2.drop(['Unnamed: 0'], axis=1, inplace=True)
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2543847843.py:24: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data_2.drop(['Defining Parameter'], axis=1, inplace=True)
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2543847843.py:25: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data_2.drop(['timezone'], axis=1, inplace=True)

Displaying Data for the Time Period June 2021 to May 2022¶

In [11]:
# Show date from June 2021 to May 2022
filtered_data_2
Out[11]:
CBSA Code Date AQI Category Number of Sites Reporting city_ascii state_id state_name lat lng population density CO NO2 Ozone PM10 PM2.5
0 10140 2022-01-01 21 Good 2 Aberdeen WA Washington 46.9757 -123.8094 16571.0 588.0 0.0 0.0 0.0 0.0 21.0
1 10140 2022-01-02 12 Good 2 Aberdeen WA Washington 46.9757 -123.8094 16571.0 588.0 0.0 0.0 0.0 0.0 12.0
2 10140 2022-01-03 18 Good 2 Aberdeen WA Washington 46.9757 -123.8094 16571.0 588.0 0.0 0.0 0.0 0.0 18.0
3 10140 2022-01-04 19 Good 2 Aberdeen WA Washington 46.9757 -123.8094 16571.0 588.0 0.0 0.0 0.0 0.0 19.0
4 10140 2022-01-05 17 Good 2 Aberdeen WA Washington 46.9757 -123.8094 16571.0 588.0 0.0 0.0 0.0 0.0 17.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
192459 49740 2021-12-27 34 Good 1 Yuma AZ Arizona 32.5995 -114.5491 137612.0 311.0 0.0 0.0 0.0 0.0 34.0
192460 49740 2021-12-28 34 Good 1 Yuma AZ Arizona 32.5995 -114.5491 137612.0 311.0 0.0 0.0 34.0 0.0 0.0
192461 49740 2021-12-29 35 Good 1 Yuma AZ Arizona 32.5995 -114.5491 137612.0 311.0 0.0 0.0 35.0 0.0 0.0
192462 49740 2021-12-30 27 Good 1 Yuma AZ Arizona 32.5995 -114.5491 137612.0 311.0 0.0 0.0 27.0 0.0 0.0
192463 49740 2021-12-31 27 Good 1 Yuma AZ Arizona 32.5995 -114.5491 137612.0 311.0 0.0 0.0 27.0 0.0 0.0

122445 rows × 17 columns

Summary Statistics for filtered_data_2¶

In [12]:
# Summary Statistics of filtered_data_2
filtered_data_2.describe().style.background_gradient(cmap = "plasma")
Out[12]:
  CBSA Code AQI Number of Sites Reporting lat lng population density
count 122445.000000 122445.000000 122445.000000 122445.000000 122445.000000 122445.000000 122445.000000
mean 29933.876189 42.221438 3.199779 38.487797 -95.242031 483641.387986 1002.417240
std 11381.454986 23.981390 4.611150 5.560619 16.597819 1428792.594744 946.266079
min 10100.000000 0.000000 1.000000 19.688300 -159.352100 1903.000000 4.000000
25% 19780.000000 29.000000 1.000000 34.617500 -108.567300 25290.000000 508.000000
50% 29540.000000 38.000000 2.000000 39.305100 -90.468100 107460.000000 776.000000
75% 40140.000000 50.000000 3.000000 42.270500 -82.197700 316743.000000 1188.000000
max 49740.000000 775.000000 44.000000 64.835300 -68.790600 18680025.000000 10768.000000

Information Overview for filtered_data_2¶

In [13]:
# Information Overview of filtered_data_2
filtered_data_2.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 122445 entries, 0 to 192463
Data columns (total 17 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   CBSA Code                  122445 non-null  int64  
 1   Date                       122445 non-null  object 
 2   AQI                        122445 non-null  int64  
 3   Category                   122445 non-null  object 
 4   Number of Sites Reporting  122445 non-null  int64  
 5   city_ascii                 122445 non-null  object 
 6   state_id                   122445 non-null  object 
 7   state_name                 122445 non-null  object 
 8   lat                        122445 non-null  float64
 9   lng                        122445 non-null  float64
 10  population                 122445 non-null  float64
 11  density                    122445 non-null  float64
 12  CO                         122445 non-null  object 
 13  NO2                        122445 non-null  object 
 14  Ozone                      122445 non-null  object 
 15  PM10                       122445 non-null  object 
 16  PM2.5                      122445 non-null  object 
dtypes: float64(4), int64(3), object(10)
memory usage: 16.8+ MB

Shape of filtered_data_2¶

In [14]:
# Shape of filtered_data_2
filtered_data_2.shape
Out[14]:
(122445, 17)

Creating Parameter Columns from AQI Values (full_data) and delete unnecessary columns¶

In [15]:
# Add the values from the \"AQI\" column to each parameter column
full_data['CO'] = np.where((full_data['Defining Parameter'] == 'CO'), full_data['AQI'], np.nan)
full_data['NO2'] = np.where((full_data['Defining Parameter'] == 'NO2'), full_data['AQI'], np.nan)
full_data['Ozone'] = np.where((full_data['Defining Parameter'] == 'Ozone'), full_data['AQI'], np.nan)
full_data['PM10'] = np.where((full_data['Defining Parameter'] == 'PM10'), full_data['AQI'], np.nan)
full_data['PM2.5'] = np.where((full_data['Defining Parameter'] == 'PM2.5'), full_data['AQI'], np.nan)

# Replace NaN to O from these columns
full_data['CO'] = full_data['CO'].fillna(0)
full_data['NO2'] = full_data['NO2'].fillna(0)
full_data['Ozone'] = full_data['Ozone'].fillna(0)
full_data['PM10'] = full_data['PM10'].fillna(0)
full_data['PM2.5'] = full_data['PM2.5'].fillna(0)

# Convert specific columns to string data type
full_data['CO'] = full_data['CO'].astype(str)  
full_data['NO2'] = full_data['NO2'].astype(str)  
full_data['Ozone'] = full_data['Ozone'].astype(str)   
full_data['PM10'] = full_data['PM10'].astype(str)  
full_data['PM2.5'] = full_data['PM2.5'].astype(str)  

# Drop unnecessary columns from full_data
full_data.drop(['Unnamed: 0'], axis=1, inplace=True)
full_data.drop(['timezone'], axis=1, inplace=True)

# Removing 'District of Columbia' and 'Puerto Rico' from full_data
full_data = full_data[~full_data['state_name'].isin(['District of Columbia', 'Puerto Rico'])]
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2661443164.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_data['CO'] = np.where((full_data['Defining Parameter'] == 'CO'), full_data['AQI'], np.nan)
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2661443164.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_data['NO2'] = np.where((full_data['Defining Parameter'] == 'NO2'), full_data['AQI'], np.nan)
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2661443164.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_data['Ozone'] = np.where((full_data['Defining Parameter'] == 'Ozone'), full_data['AQI'], np.nan)
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2661443164.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_data['PM10'] = np.where((full_data['Defining Parameter'] == 'PM10'), full_data['AQI'], np.nan)
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2661443164.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_data['PM2.5'] = np.where((full_data['Defining Parameter'] == 'PM2.5'), full_data['AQI'], np.nan)
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2661443164.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_data['CO'] = full_data['CO'].fillna(0)
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2661443164.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_data['NO2'] = full_data['NO2'].fillna(0)
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2661443164.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_data['Ozone'] = full_data['Ozone'].fillna(0)
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2661443164.py:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_data['PM10'] = full_data['PM10'].fillna(0)
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2661443164.py:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_data['PM2.5'] = full_data['PM2.5'].fillna(0)
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2661443164.py:16: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_data['CO'] = full_data['CO'].astype(str)
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2661443164.py:17: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_data['NO2'] = full_data['NO2'].astype(str)
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2661443164.py:18: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_data['Ozone'] = full_data['Ozone'].astype(str)
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2661443164.py:19: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_data['PM10'] = full_data['PM10'].astype(str)
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2661443164.py:20: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_data['PM2.5'] = full_data['PM2.5'].astype(str)
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2661443164.py:23: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_data.drop(['Unnamed: 0'], axis=1, inplace=True)
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\2661443164.py:24: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_data.drop(['timezone'], axis=1, inplace=True)

Displaying Data for the Time Period May 2020 to May 2022¶

In [16]:
# Show date from May 2020 to May 2022
full_data
Out[16]:
CBSA Code Date AQI Category Defining Parameter Number of Sites Reporting city_ascii state_id state_name lat lng population density CO NO2 Ozone PM10 PM2.5
0 10140 2022-01-01 21 Good PM2.5 2 Aberdeen WA Washington 46.9757 -123.8094 16571.0 588.0 0.0 0.0 0.0 0.0 21.0
1 10140 2022-01-02 12 Good PM2.5 2 Aberdeen WA Washington 46.9757 -123.8094 16571.0 588.0 0.0 0.0 0.0 0.0 12.0
2 10140 2022-01-03 18 Good PM2.5 2 Aberdeen WA Washington 46.9757 -123.8094 16571.0 588.0 0.0 0.0 0.0 0.0 18.0
3 10140 2022-01-04 19 Good PM2.5 2 Aberdeen WA Washington 46.9757 -123.8094 16571.0 588.0 0.0 0.0 0.0 0.0 19.0
4 10140 2022-01-05 17 Good PM2.5 2 Aberdeen WA Washington 46.9757 -123.8094 16571.0 588.0 0.0 0.0 0.0 0.0 17.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
359311 49740 2020-12-27 32 Good Ozone 1 Yuma AZ Arizona 32.5995 -114.5491 137612.0 311.0 0.0 0.0 32.0 0.0 0.0
359312 49740 2020-12-28 44 Good PM10 1 Yuma AZ Arizona 32.5995 -114.5491 137612.0 311.0 0.0 0.0 0.0 44.0 0.0
359313 49740 2020-12-29 38 Good Ozone 1 Yuma AZ Arizona 32.5995 -114.5491 137612.0 311.0 0.0 0.0 38.0 0.0 0.0
359314 49740 2020-12-30 36 Good Ozone 1 Yuma AZ Arizona 32.5995 -114.5491 137612.0 311.0 0.0 0.0 36.0 0.0 0.0
359315 49740 2020-12-31 35 Good PM2.5 1 Yuma AZ Arizona 32.5995 -114.5491 137612.0 311.0 0.0 0.0 0.0 0.0 35.0

302368 rows × 18 columns

Summary Statistics for full_data¶

In [17]:
# Summary Statistics of full_data
full_data.describe().style.background_gradient(cmap = "plasma")
Out[17]:
  CBSA Code AQI Number of Sites Reporting lat lng population density
count 302368.000000 302368.000000 302368.000000 302368.000000 302368.000000 302368.000000 302368.000000
mean 29953.424701 41.652275 3.309636 38.607374 -95.296459 466371.134455 1000.657966
std 11367.326282 24.314623 4.803664 5.607960 16.784485 1395576.343727 930.244469
min 10100.000000 0.000000 1.000000 19.688300 -159.352100 1903.000000 4.000000
25% 19820.000000 29.000000 1.000000 34.769000 -109.220900 24834.000000 508.000000
50% 29620.000000 38.000000 2.000000 39.379700 -90.245100 101326.000000 776.000000
75% 40140.000000 49.000000 3.000000 42.337200 -82.151100 300268.000000 1183.000000
max 49740.000000 1250.000000 45.000000 64.835300 -68.790600 18680025.000000 10768.000000

Information Overview for full_data¶

In [18]:
# Information Overview of full_data
full_data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 302368 entries, 0 to 359315
Data columns (total 18 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   CBSA Code                  302368 non-null  int64  
 1   Date                       302368 non-null  object 
 2   AQI                        302368 non-null  int64  
 3   Category                   302368 non-null  object 
 4   Defining Parameter         302368 non-null  object 
 5   Number of Sites Reporting  302368 non-null  int64  
 6   city_ascii                 302368 non-null  object 
 7   state_id                   302368 non-null  object 
 8   state_name                 302368 non-null  object 
 9   lat                        302368 non-null  float64
 10  lng                        302368 non-null  float64
 11  population                 302368 non-null  float64
 12  density                    302368 non-null  float64
 13  CO                         302368 non-null  object 
 14  NO2                        302368 non-null  object 
 15  Ozone                      302368 non-null  object 
 16  PM10                       302368 non-null  object 
 17  PM2.5                      302368 non-null  object 
dtypes: float64(4), int64(3), object(11)
memory usage: 43.8+ MB

Shape of full_data¶

In [19]:
# Shape of full_data
full_data.shape
Out[19]:
(302368, 18)

Check Missing Values in Dataset¶

In [20]:
# Show the count of missing values in each column
full_data.isnull().sum() 
Out[20]:
CBSA Code                    0
Date                         0
AQI                          0
Category                     0
Defining Parameter           0
Number of Sites Reporting    0
city_ascii                   0
state_id                     0
state_name                   0
lat                          0
lng                          0
population                   0
density                      0
CO                           0
NO2                          0
Ozone                        0
PM10                         0
PM2.5                        0
dtype: int64

Finding Air Quality Index(AQI) values greater than 500 [Beyond the AQI]¶

In [21]:
limit1 = 500 # maximum limit of aqi
date_column1 = full_data['Date'] # assign Date column from dataset into date_column variable
sn_column1 = full_data['state_name'] # assign state_name column from dataset into sn_column variable
aqi_column1 = full_data['AQI'] # assign aqi column from dataset into aqi_column variable
count_values1 = aqi_column1[aqi_column1 > limit1].count() # count the total number of AQI values
aqi_date1 = date_column1[aqi_column1 > limit1] # find date whose limit is greater than 500
aqi_state_name1 = sn_column1[aqi_column1 > limit1]
aqi_values1 = aqi_column1[aqi_column1 > limit1] # find AQI values whose limit is greater than 500

print('Total Number of AQI Values>500 from May 2020 to May 2022:', count_values1) # print the total number of AQI values
print('--------------------------------------------------------------') # print broken line

# Creating a DataFrame for outlier values of AQI
outlier_data = {
    'Date': aqi_date1,
    'State Name': aqi_state_name1,
    'AQI': aqi_values1
}
outlier_df = pd.DataFrame(outlier_data)

# Display the DataFrame without the index
print(outlier_df.to_string(index=False))
Total Number of AQI Values>500 from May 2020 to May 2022: 33
--------------------------------------------------------------
      Date State Name  AQI
2022-02-23 New Mexico  775
2022-05-29 California  733
2022-04-11 California  644
2022-05-29 California  631
2021-10-11 California  552
2021-10-11 California  666
2021-03-16 New Mexico  665
2021-09-14 California  537
2020-09-08     Oregon  502
2020-09-11     Oregon  583
2020-09-12     Oregon  506
2020-09-08 California  684
2020-09-10     Oregon  510
2020-09-12     Oregon  550
2020-09-13     Oregon  502
2020-09-08 California  543
2020-09-07 Washington  908
2020-09-12     Oregon  679
2020-09-12     Oregon  517
2020-08-16    Arizona 1124
2020-11-07    Arizona  520
2020-09-13     Oregon  509
2020-09-12     Oregon  561
2020-06-05 California  576
2020-05-19    Wyoming 1250
2020-09-13     Oregon  518
2020-09-09     Oregon  550
2020-09-11     Oregon  641
2020-09-12     Oregon  526
2020-09-13     Oregon  548
2020-09-13 Washington  536
2020-09-07 Washington  819
2020-09-12 Washington  519

Converting Date Column to Datetime Format of Filtered Data 1 & 2¶

In [22]:
col = 'Date' # select column

# Converting Date Column to Datetime Format of Filtered Data 1
filtered_data_1[col] = pd.to_datetime(filtered_data_1[col], format='%Y-%m-%d') 
# Converting Date Column to Datetime Format of Filtered Data 2
filtered_data_2[col] = pd.to_datetime(filtered_data_2[col], format='%Y-%m-%d')

Exploring Air Quality and U.S. States¶

In [23]:
print("Categories of Air Quality:\n",full_data.Category.unique()) # print AQI category
print("")# print new line

states_name = sorted(full_data.state_name.unique()) # sort state name
print("Name of the U.S. State:\n",sorted(states_name))# print statename in sorted
print("")# print new line

num_of_us_states = len(states_name) # length of state name
print("Total Number of the U.S. State:",num_of_us_states) # print length of state name
Categories of Air Quality:
 ['Good' 'Moderate' 'Unhealthy for Sensitive Groups' 'Unhealthy'
 'Hazardous' 'Very Unhealthy']

Name of the U.S. State:
 ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming']

Total Number of the U.S. State: 50

Data Visualization¶

Line Graphs of Air Quality in the US¶

In [24]:
fig1 = px.line(filtered_data_1, x="Date", y="AQI", color="state_name", title='Line graph of Air quality in the US from May 2020 to May 2021')
fig1.show() # Line graph of Air quality in the US from May 2020 to May 2021

fig2 = px.line(filtered_data_2, x="Date", y="AQI", color="state_name", title='Line graph of Air quality in the US from June 2021 to May 2022')
fig2.show() # Line graph of Air quality in the US from June 2021 to May 2022

Time Series Plot of US AQI (May 2020 - May 2021)¶

In [25]:
# Air Quality Index (AQI) Categories - Color-Coded Rectangles
GOOD = plt.Rectangle((-1_000,0), 17_500, 50, fc='green',ec="green", alpha=0.25)
MODERATE = plt.Rectangle((-1_000,50), 17_500, 50, fc='yellow',ec="yellow", alpha=0.25)
UNHEALTHY_FOR_SENSITIVE = plt.Rectangle((-1_000,100), 17_500, 50,fc='orange',ec="orange", alpha=0.25)
UNHEALTHY = plt.Rectangle((-1_000,150), 17_500, 50, fc='red',ec="red", alpha=0.25)
VERY_UNHEALTHY = plt.Rectangle((-1_000,200), 17_500, 100, fc='purple',ec="purple", alpha=0.25)
HAZARDOUS = plt.Rectangle((-1_000,300), 17_500, 200, fc='maroon',ec="maroon", alpha=0.25)

# plots time series of US AQI
df1 = filtered_data_1.groupby('Date').AQI.mean().loc[:'2021-05-31']

df1.plot(figsize=(12,6), color='black', alpha=0.75)
plt.title('Time Series Plot of US AQI (May 2020 - May 2021)', {'size': 25, 'weight': 'bold'})

# Adds color patches 
plt.gca().add_patch(copy(GOOD))
plt.gca().add_patch(copy(MODERATE))
plt.gca().add_patch(copy(UNHEALTHY_FOR_SENSITIVE))
plt.gca().add_patch(copy(UNHEALTHY))
plt.gca().add_patch(copy(VERY_UNHEALTHY))
plt.gca().add_patch(copy(HAZARDOUS))

# x and y axis labeling
plt.xlabel('Date', {'size': 14, 'weight': 'bold'})
plt.ylabel('AQI', {'size': 14, 'weight': 'bold'})

# Set limit of y axis
plt.ylim(0, 250)

#Sets up legend
legend_elements = [
    Patch(facecolor='green', edgecolor='black', alpha=0.5, label='Good (0-50)'),
    Patch(facecolor='yellow', edgecolor='black', alpha=0.5, label='Moderate (51-100)'),
    Patch(facecolor='orange', edgecolor='black', alpha=0.5, label='Unhealthy for Sensitive Groups (101-150)'),
    Patch(facecolor='red', edgecolor='black', alpha=0.5, label='Unhealthy (151-200)'),
    Patch(facecolor='purple', edgecolor='black', alpha=0.5, label='Very Unhealthy (201-300)'),
    Patch(facecolor='maroon', edgecolor='black', alpha=0.5, label='Hazardous (301-500)'),
                  ]

legend = plt.legend(handles=legend_elements)
legend.set_title("AQI Category", prop = {'size':12, 'weight': 'bold'})

#displays the plot
plt.show()

Time Series Plot of US AQI (June 2021 - May 2022)¶

In [26]:
#Air Quality Index (AQI) Categories - Color-Coded Rectangles
GOOD = plt.Rectangle((-1_000,0), 17_500, 50, fc='green',ec="green", alpha=0.25)
MODERATE = plt.Rectangle((-1_000,50), 17_500, 50, fc='yellow',ec="yellow", alpha=0.25)
UNHEALTHY_FOR_SENSITIVE = plt.Rectangle((-1_000,100), 17_500, 50,fc='orange',ec="orange", alpha=0.25)
UNHEALTHY = plt.Rectangle((-1_000,150), 17_500, 50, fc='red',ec="red", alpha=0.25)
VERY_UNHEALTHY = plt.Rectangle((-1_000,200), 17_500, 100, fc='purple',ec="purple", alpha=0.25)
HAZARDOUS = plt.Rectangle((-1_000,300), 17_500, 200, fc='maroon',ec="maroon", alpha=0.25)

# plots time series of US average AQI
df2 = filtered_data_2.groupby('Date').AQI.mean().loc[:'2022-05-31']

df2.plot(figsize=(12,6), color='black', alpha=0.75)
plt.title('Time Series Plot of US AQI (June 2021 - May 2022)', {'size': 25, 'weight': 'bold'})

# Adds color patches 
plt.gca().add_patch(copy(GOOD))
plt.gca().add_patch(copy(MODERATE))
plt.gca().add_patch(copy(UNHEALTHY_FOR_SENSITIVE))
plt.gca().add_patch(copy(UNHEALTHY))
plt.gca().add_patch(copy(VERY_UNHEALTHY))
plt.gca().add_patch(copy(HAZARDOUS))

# x and y axis labeling
plt.xlabel('Date', {'size': 14, 'weight': 'bold'})
plt.ylabel('AQI', {'size': 14, 'weight': 'bold'})

# Set limit of y axis
plt.ylim(0, 250)

#Sets up legend
legend_elements = [
    Patch(facecolor='green', edgecolor='black', alpha=0.5, label='Good (0-50)'),
    Patch(facecolor='yellow', edgecolor='black', alpha=0.5, label='Moderate (51-100)'),
    Patch(facecolor='orange', edgecolor='black', alpha=0.5, label='Unhealthy for Sensitive Groups (101-150)'),
    Patch(facecolor='red', edgecolor='black', alpha=0.5, label='Unhealthy (151-200)'),
    Patch(facecolor='purple', edgecolor='black', alpha=0.5, label='Very Unhealthy (201-300)'),
    Patch(facecolor='maroon', edgecolor='black', alpha=0.5, label='Hazardous (301-500)'),
                  ]

legend = plt.legend(handles=legend_elements)
legend.set_title("AQI Category", prop = {'size':12, 'weight': 'bold'})

#displays the plot
plt.show()

Setting Seaborn Parameters and Matplotlib Style¶

In [27]:
sns.set(rc = {"figure.figsize":(12, 6)}) #graph size
plt.style.use("fivethirtyeight")

Overall Count Plot of Good AQI Values (0-50) in the United States¶

In [28]:
# bar plot of overall good aqi values (0-50) in US from May 2020-May 2021
plot = sns.countplot(data = filtered_data_1, x = "AQI", color = "#00DB16")
plt.title("Overall Count of Good AQI Values (AQI 0-50) in the USA from May 2020-May 2021")
plot.set_xticklabels(plot.get_xticklabels(), fontsize = 9)
plt.xlim(0, 51)
plt.figure(figsize = (10, 12))
plt.show() 
<Figure size 1000x1200 with 0 Axes>
In [29]:
# bar plot of overall good aqi values (0-50) in US from June 2021-May 2022
plot = sns.countplot(data = filtered_data_2, x = "AQI", color = "#00DB16")
plt.title("Overall Count of Good AQI Values (AQI 0-50) in the USA from June 2021-May 2022")
plot.set_xticklabels(plot.get_xticklabels(), fontsize = 9)
plt.xlim(0, 51)
plt.figure(figsize = (10, 12))
plt.show() 
<Figure size 1000x1200 with 0 Axes>

Overall Count Plot of Moderate AQI Values (50-100) in the United States¶

In [30]:
# bar plot of overall moderate aqi values (50-100) in US from May 2020-May 2021
plot = sns.countplot(data = filtered_data_1, x = "AQI", color = "#F3FC00")
plt.title("Overall Count of Moderate AQI Values (AQI 50-100) in the USA from May 2020-May 2021")
plot.set_xticklabels(plot.get_xticklabels(), fontsize = 9)
plt.xlim(50, 101)
plt.figure(figsize = (10, 12))
plt.show()
<Figure size 1000x1200 with 0 Axes>
In [31]:
# bar plot of overall moderate aqi values (50-100) in US from June 2021-May 2022
plot = sns.countplot(data = filtered_data_2, x = "AQI", color = "#F3FC00")
plt.title("Overall Count of Moderate AQI Values (AQI 50-100) in the USA from June 2021-May 2022")
plot.set_xticklabels(plot.get_xticklabels(), fontsize = 9)
plt.xlim(50, 101)
plt.figure(figsize = (10, 12))
plt.show()
<Figure size 1000x1200 with 0 Axes>

Overall Count Plot of Unhealthy for Sensitive Groups AQI Values (50-100) in the United States¶

In [32]:
# bar plot of overall moderate aqi values (50-100) in US from May 2020-May 2021
plot = sns.countplot(data = filtered_data_1, x = "AQI", color = "#FFA200")
plt.title("Overall Count of Unhealthy for Sensitive Groups AQI Values (AQI 100-150) in the USA from May 2020-May 2021")
plot.set_xticklabels(plot.get_xticklabels(), fontsize = 9)
plt.xlim(100, 150)
plt.figure(figsize = (10, 12))
plt.show()
<Figure size 1000x1200 with 0 Axes>
In [33]:
# bar plot of overall moderate aqi values (50-100) in US from June 2021-May 2022
plot = sns.countplot(data = filtered_data_2, x = "AQI", color = "#FFA200")
plt.title("Overall Count of Unhealthy for Sensitive Groups AQI Values (AQI 100-150) in the USA from June 2021-May 2022")
plot.set_xticklabels(plot.get_xticklabels(), fontsize = 9)
plt.xlim(100, 150)
plt.figure(figsize = (10, 12))
plt.show()
<Figure size 1000x1200 with 0 Axes>

Distribution of AQI Values (0-150) in Each State from May 2020 to May 2021¶

In [34]:
for state in sorted(['Washington', 'Ohio', 'Georgia', 'Oregon', 'New York', 'New Mexico', 'Texas',
 'Michigan', 'Wisconsin', 'Oklahoma', 'Arkansas', 'Maine', 'California',
 'Maryland', 'Louisiana', 'Nebraska', 'Montana', 'North Dakota', 'Virginia',
 'Illinois', 'Indiana', 'Massachusetts', 'Colorado', 'Kentucky', 'Florida',
 'Utah', 'Iowa', 'West Virginia', 'South Carolina', 'North Carolina',
 'Tennessee', 'New Hampshire', 'Mississippi', 'Missouri', 'Alabama',
 'Minnesota', 'Arizona', 'Pennsylvania', 'Wyoming', 'Hawaii',
 'Idaho', 'Nevada', 'Rhode Island', 'New Jersey',
 'South Dakota', 'Alaska', 'Connecticut', 'Vermont', 'Kansas', 'Delaware']): # for loop of sorted state name
    
    df1 = filtered_data_1[filtered_data_1["state_name"] == state]    
    # bar plot of good aqi values (0-50) in US from May 2020-May 2021
    plot = sns.countplot(data = df1, x = "AQI", color = "#00DB16")
    plt.title(f"Count of Good AQI Values (AQI 0-50) in {state}")
    plot.set_xticklabels(plot.get_xticklabels(), fontsize = 9)
    plt.xlim(0, 51)
    plt.show()
    
    df2 = filtered_data_1[filtered_data_1["state_name"] == state]
    # bar plot of moderate aqi values (50-100) in US from May 2020-May 2021
    plot = sns.countplot(data = df2, x = "AQI", color = "#F3FC00")
    plt.title(f"Count of Moderate AQI Values (AQI 50-100) in {state}")
    plot.set_xticklabels(plot.get_xticklabels(), fontsize = 9)
    plt.xlim(50, 100)
    plt.show()
    
    df3 = filtered_data_1[filtered_data_1["state_name"] == state]
    # bar plot of unhealthy aqi values (100-150) in US from May 2020-May 2021
    plot = sns.countplot(data = df3, x = "AQI", color = "#FFA200")
    plt.title(f"Unhealthy for Sensitive Groups AQI Values (AQI 100-150) in {state}")
    plot.set_xticklabels(plot.get_xticklabels(), fontsize = 9)
    plt.xlim(100, 150)
    plt.show()

Distribution of AQI Values (0-150) in Each State from June 2021 to May 2022¶

In [35]:
for state in sorted(['Washington', 'Ohio', 'Georgia', 'Oregon', 'New York', 'New Mexico', 'Texas',
 'Michigan', 'Wisconsin', 'Oklahoma', 'Arkansas', 'Maine', 'California',
 'Maryland', 'Louisiana', 'Nebraska', 'Montana', 'North Dakota', 'Virginia',
 'Illinois', 'Indiana', 'Massachusetts', 'Colorado', 'Kentucky', 'Florida',
 'Utah', 'Iowa', 'West Virginia', 'South Carolina', 'North Carolina',
 'Tennessee', 'New Hampshire', 'Mississippi', 'Missouri', 'Alabama',
 'Minnesota', 'Arizona', 'Pennsylvania', 'Wyoming', 'Hawaii',
 'Idaho', 'Nevada', 'Rhode Island', 'New Jersey',
 'South Dakota', 'Alaska', 'Connecticut', 'Vermont', 'Kansas', 'Delaware']): # for loop of sorted state name
    
    df4 = filtered_data_2[filtered_data_2["state_name"] == state]    
    # bar plot of good aqi values (0-50) in US from June 2021-May 2022
    plot = sns.countplot(data = df4, x = "AQI", color = "#00DB16")
    plt.title(f"Count of Good AQI Values (AQI 0-50) in {state}")
    plot.set_xticklabels(plot.get_xticklabels(), fontsize = 9)
    plt.xlim(0, 51)
    plt.show()
    
    df5 = filtered_data_2[filtered_data_2["state_name"] == state]
    # bar plot of moderate aqi values (50-100) in US from June 2021-May 2022
    plot = sns.countplot(data = df5, x = "AQI", color = "#F3FC00")
    plt.title(f"Count of Moderate AQI Values (AQI 50-100) in {state}")
    plot.set_xticklabels(plot.get_xticklabels(), fontsize = 9)
    plt.xlim(50, 100)
    plt.show()
    
    df6 = filtered_data_2[filtered_data_2["state_name"] == state]
    # bar plot of unhealthy aqi values (100-150) in US from June 2021-May 2022
    plot = sns.countplot(data = df6, x = "AQI", color = "#FFA200")
    plt.title(f"Unhealthy for Sensitive Groups AQI Values (AQI 100-150) in {state}")
    plot.set_xticklabels(plot.get_xticklabels(), fontsize = 9)
    plt.xlim(100, 150)
    plt.show()

Date Extraction and Conversion in Full Dataset from May 2020 to May 2022¶

In [36]:
# convert date to datetime of full_data
full_data.Date = pd.to_datetime(full_data.Date)
# extract year from date of full_data
full_data.insert(2,'Year', full_data.Date.dt.year)
# extract month from date of full_data
full_data.insert(3,'Month', full_data.Date.dt.month)
# extract day from date of full_data
full_data.insert(4,'Day', full_data.Date.dt.day)
C:\Users\Mohammad Navid\AppData\Local\Temp\ipykernel_15060\1187036773.py:2: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

Unique Years in the Full Dataset from May 2020 to May 2022¶

In [37]:
full_data.Year.unique() # print unique years of full_data
Out[37]:
array([2022, 2021, 2020], dtype=int64)

Function for Plot Labels and X-Tick Rotation¶

In [38]:
def labels(Title, Xlabel=None, Ylabel=None, Rotation= None): #definition of labels for graph title, x and y labels and and rotation of xticks
  """
  This function return title, xlabel, ylabel and rotation of xticks
  """
  plt.title(Title, fontsize= 22)
  plt.xlabel(Xlabel, fontsize= 18)
  plt.ylabel(Ylabel, fontsize= 18)
  plt.xticks(rotation= Rotation);

Count of Values Along Each Year¶

In [39]:
# no. of testing along each year
plt.style.use('dark_background')
sns.countplot(x=full_data.Year, palette='flare')
labels('Number of Values Along Each Year', 'Year', 'No. of Testing', 45)
In [40]:
# total number of values of year 2020, 2021 and 2022
year_2020_total = full_data[full_data['Year'] == 2020].shape[0]
year_2021_total = full_data[full_data['Year'] == 2021].shape[0]
year_2022_total = full_data[full_data['Year'] == 2022].shape[0]

print(f"Total values for 2020: {year_2020_total}")
print(f"Total values for 2021: {year_2021_total}")
print(f"Total values for 2022: {year_2022_total}")
Total values for 2020: 111610
Total values for 2021: 165294
Total values for 2022: 25464

Value Counts of AQI Categories¶

In [41]:
# value counts of aqi category
full_data['Category'].value_counts()
Out[41]:
Good                              234713
Moderate                           61009
Unhealthy for Sensitive Groups      4505
Unhealthy                           1643
Very Unhealthy                       348
Hazardous                            150
Name: Category, dtype: int64

Percentage Distribution of Defining Parameters¶

In [42]:
# bar plot showing percentage of defining parameter (CO, NO2, Ozone, PM2.5, PM10)
pmf = Pmf.from_seq(full_data['Defining Parameter'])
plt.style.use('ggplot')
plt.bar(pmf.index, pmf.values)
labels('Percentage of Defination Parameter', 'Parameter', 'Percentage')

Model Training and Evaluation¶

Linear Regression, Decision Tree Regressor, Random Forest Regressor & Gradient Boosting Regressor¶

In [43]:
# X and y are the input and target variables
X = full_data[['Ozone','PM10','PM2.5']]
y = full_data['AQI']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Linear Regression
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)
linear_reg_pred = linear_reg.predict(X_test)

# Train a Decision Tree Regressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(X_train, y_train)
tree_reg_pred = tree_reg.predict(X_test)

# Train a Random Forest Regressor
forest_reg = RandomForestRegressor()
forest_reg.fit(X_train, y_train)
forest_reg_pred = forest_reg.predict(X_test)

# Train a Gradient Boosting Regressor
gb_reg = GradientBoostingRegressor()
gb_reg.fit(X_train, y_train)
gb_reg_pred = gb_reg.predict(X_test)

# Function to calculate metrics
def calculate_metrics(y_true, y_pred):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    return rmse, mse, mae

# Calculate metrics for each model
models = {
    'Linear Regression': linear_reg_pred,
    'Decision Tree Regressor': tree_reg_pred,
    'Random Forest Regressor': forest_reg_pred,
    'Gradient Boosting Regressor': gb_reg_pred
}

for name, predictions in models.items():
    rmse, mse, mae = calculate_metrics(y_test, predictions)
    print(f"Model Evaluation Metrics for {name}:")
    print(f"RMSE: {rmse:.4f}") # value of root mean squared error (RMSE)
    print(f"MSE: {mse:.4f}") # value of mean squared error (MSE)
    print(f"MAE: {mae:.4f}") # value of mean average error (MAE)
    print("---------------------")
Model Evaluation Metrics for Linear Regression:
RMSE: 2.8401
MSE: 8.0661
MAE: 0.5574
---------------------
Model Evaluation Metrics for Decision Tree Regressor:
RMSE: 1.7845
MSE: 3.1843
MAE: 0.1469
---------------------
Model Evaluation Metrics for Random Forest Regressor:
RMSE: 1.7800
MSE: 3.1683
MAE: 0.1464
---------------------
Model Evaluation Metrics for Gradient Boosting Regressor:
RMSE: 1.9491
MSE: 3.7991
MAE: 0.4949
---------------------
In [ ]: